
def splitPaths(
        inputFileName="/data1/team/weijiang/machine-learning-project-related/programs/polyIRToolkit/polyIRIndexer/clueWebOffcialIndexPathsALL.txt",
        outputFileBase="/data1/team/weijiang/machine-learning-project-related/auxFiles/cluewebDataPaths/"):
    """Split a flat list of paths into one output file per segment.

    Every non-blank line of ``inputFileName`` is split on "/" and the field
    at index 3 (i.e. ``<segment>`` in ``/<disk>/<folder>/<segment>/...``) is
    taken as the segment name. The stripped line is echoed to stdout and
    appended to the file ``outputFileBase/<segment>``.

    Args:
        inputFileName: Path of the text file holding one path per line.
        outputFileBase: Existing directory that receives the per-segment
            files (with or without a trailing separator).

    Raises:
        IOError/OSError: If the input file or an output file cannot be
            opened.
        IndexError: If a non-blank line has fewer than four "/"-separated
            fields.

    NOTE: a file named "en0000" is always created (possibly empty), because
    it is opened before the first line is read. Output files are opened in
    "w" mode, so the input is expected to be grouped by segment; a segment
    that reappears after a different one overwrites its earlier file.
    """
    import os  # local import: the block must not rely on file-level imports

    outputFileName = "en0000"

    with open(inputFileName, "r") as inputFileHandler:
        currentOutputFileHandler = open(
            os.path.join(outputFileBase, outputFileName), "w")
        try:
            # Iterate the file lazily instead of materializing readlines().
            for currentLine in inputFileHandler:
                strippedLine = currentLine.strip()
                if not strippedLine:
                    # Blank lines carry no path; skip them instead of
                    # failing with an IndexError on the split below.
                    continue

                # Single-argument print(...) behaves the same on Python 2
                # and Python 3.
                print(strippedLine)

                currentSegment = strippedLine.split("/")[3]
                if outputFileName != currentSegment:
                    outputFileName = currentSegment
                    # Close the previous segment file before switching.
                    currentOutputFileHandler.close()
                    currentOutputFileHandler = open(
                        os.path.join(outputFileBase, outputFileName), "w")
                currentOutputFileHandler.write(strippedLine + "\n")
        finally:
            # Guarantees the last segment file is flushed and closed, even
            # when a malformed line raises.
            currentOutputFileHandler.close()
    
def buildThreadPathsMappingTable(
        outputFileName="/data1/team/weijiang/machine-learning-project-related/auxFiles/threadPathMappingTable"):
    """Write the table of segment names, one name per line.

    The file contains 138 lines: "en0000" through "en0133" followed by
    "enwp00" through "enwp03". Any existing file at ``outputFileName`` is
    overwritten.

    Args:
        outputFileName: Path of the table file to create.

    Raises:
        IOError/OSError: If the output file cannot be opened for writing.
    """
    # 134 "en" entries (0..133) followed by 4 "enwp" entries (0..3).
    segmentNames = ["en%04d" % i for i in range(134)]
    segmentNames += ["enwp%02d" % j for j in range(4)]

    # The context manager closes (and therefore flushes) the file on exit.
    with open(outputFileName, "w") as outputFileHandler:
        for segmentName in segmentNames:
            outputFileHandler.write(segmentName + "\n")
        
# Script driver: regenerate the thread/path mapping table, then report
# completion on stdout.
buildThreadPathsMappingTable()
# Parenthesized single-argument form prints identically on Python 2 and is
# valid syntax on Python 3 (the bare print statement is not).
print("Job Done!")